{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# /usr/bin/env python3\n",
    "import os\n",
    "import sys\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_file = './data/Train_Data.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.read_csv(train_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>title</th>\n",
       "      <th>text</th>\n",
       "      <th>unknownEntities</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>83dcefb7</td>\n",
       "      <td>揭秘趣步骗局，趣步是什么，趣步是怎么赚钱的？趣步公司可靠吗？趣步合法吗？相信是众多小伙伴最关...</td>\n",
       "      <td>揭秘趣步骗局，趣步是什么，趣步是怎么赚钱的？趣步公司可靠吗？趣步合法吗？相信是众多小伙伴最关...</td>\n",
       "      <td>趣步</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1ad5be0d</td>\n",
       "      <td>企业纳税贷额度，全国小微企业都可做！</td>\n",
       "      <td>{IMG:1}{IMG:2}公司张总说：“没想到缴税还能办贷款，本来我们还在为准备纳税证明、...</td>\n",
       "      <td>西部助贷</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6dd28e9b</td>\n",
       "      <td>一线|新控股股东入主后联讯证券拟改名为知识城证券</td>\n",
       "      <td>腾讯新闻《》作者刘鹏成功接下47.24%股权入主具有30年历史的联讯证券后，广州开发区金融控...</td>\n",
       "      <td>广州开发区金融控股集团有限公司</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>f3b61b38</td>\n",
       "      <td>度小满金融：年中黄金情绪指数拉升最快</td>\n",
       "      <td>{IMG:1}近日，度小满金融在首次发布的《2019年上半年国民投资理财情绪指数报告》中提到...</td>\n",
       "      <td>科创板基金;度小满金融</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>84b12bae</td>\n",
       "      <td>区块链技术创造新价值！纷享车链赋能传统智能车载硬件设备</td>\n",
       "      <td>区块链技术创造新价值！纷享车链赋能传统智能车载硬件设备时间：2018年10月22日信息来源：...</td>\n",
       "      <td>七巧智盒3in1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1db87a14</td>\n",
       "      <td>一年不到，40多万的收入，有谁不想做？关键是长久，稳定，持续收入。瑞联盟，全国支付第二交易额...</td>\n",
       "      <td>一年不到，40多万的收入，有谁不想做？关键是长久，稳定，持续收入。瑞联盟，全国支付第二交易额...</td>\n",
       "      <td>瑞联盟</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         id                                              title  \\\n",
       "0  83dcefb7  揭秘趣步骗局，趣步是什么，趣步是怎么赚钱的？趣步公司可靠吗？趣步合法吗？相信是众多小伙伴最关...   \n",
       "1  1ad5be0d                                 企业纳税贷额度，全国小微企业都可做！   \n",
       "2  6dd28e9b                           一线|新控股股东入主后联讯证券拟改名为知识城证券   \n",
       "3  f3b61b38                                 度小满金融：年中黄金情绪指数拉升最快   \n",
       "4  84b12bae                        区块链技术创造新价值！纷享车链赋能传统智能车载硬件设备   \n",
       "5  1db87a14  一年不到，40多万的收入，有谁不想做？关键是长久，稳定，持续收入。瑞联盟，全国支付第二交易额...   \n",
       "\n",
       "                                                text  unknownEntities  \n",
       "0  揭秘趣步骗局，趣步是什么，趣步是怎么赚钱的？趣步公司可靠吗？趣步合法吗？相信是众多小伙伴最关...               趣步  \n",
       "1  {IMG:1}{IMG:2}公司张总说：“没想到缴税还能办贷款，本来我们还在为准备纳税证明、...             西部助贷  \n",
       "2  腾讯新闻《》作者刘鹏成功接下47.24%股权入主具有30年历史的联讯证券后，广州开发区金融控...  广州开发区金融控股集团有限公司  \n",
       "3  {IMG:1}近日，度小满金融在首次发布的《2019年上半年国民投资理财情绪指数报告》中提到...      科创板基金;度小满金融  \n",
       "4  区块链技术创造新价值！纷享车链赋能传统智能车载硬件设备时间：2018年10月22日信息来源：...         七巧智盒3in1  \n",
       "5  一年不到，40多万的收入，有谁不想做？关键是长久，稳定，持续收入。瑞联盟，全国支付第二交易额...              瑞联盟  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.head(6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Max length of titles:\t193\n",
      "Max length of texts:\t31819\n"
     ]
    }
   ],
   "source": [
    "# title的最大长度/text的最大长度\n",
    "max_title_len = max([len(a) for a in list(df_train['title']) if type(a) is not float])\n",
    "print('Max length of titles:\\t{}'.format(max_title_len))\n",
    "max_text_len = max([len(a) for a in list(df_train['text']) if type(a) is not float])\n",
    "print('Max length of texts:\\t{}'.format(max_text_len))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# text的行数统计\n",
    "text_lines = [len(a.split()) for a in list(df_train['text']) if type(a) is not float]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['优客百度']\n",
      "[]\n",
      "[]\n",
      "147\n",
      "140\n",
      "128\n"
     ]
    }
   ],
   "source": [
    "# 检查常见实体会不会作为查询目标\n",
    "# 检查实体中有没有叫百度的\n",
    "entities_freq = [a for a in list(df_train['unknownEntities']) if type(a) is not float and a.find('百度')>0]\n",
    "print(entities_freq)\n",
    "# 检查实体中有没有叫腾讯的\n",
    "entities_freq = [a for a in list(df_train['unknownEntities']) if type(a) is not float and a.find('腾讯')>0]\n",
    "print(entities_freq)\n",
    "# 检查实体中有没有叫阿里的\n",
    "entities_freq = [a for a in list(df_train['unknownEntities']) if type(a) is not float and a.find('阿里')>0]\n",
    "print(entities_freq)\n",
    "\n",
    "# 检查文本中有没有叫百度的\n",
    "entities_freq = [a for a in list(df_train['text']) if type(a) is not float and a.find('百度')>0]\n",
    "print(len(entities_freq))\n",
    "# 检查文本中有没有叫腾讯的\n",
    "entities_freq = [a for a in list(df_train['text']) if type(a) is not float and a.find('腾讯')>0]\n",
    "print(len(entities_freq))\n",
    "# 检查文本中有没有叫阿里的\n",
    "entities_freq = [a for a in list(df_train['text']) if type(a) is not float and a.find('阿里')>0]\n",
    "print(len(entities_freq))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'区块链技术创造新价值！纷享车链赋能传统智能车载硬件设备时间：2018年10月22日信息来源：头条新闻网作者：佚名【字体：大中小】?伴随着区块链技术不断渗透至实体行业，区块链与出行领域的融合价值也正日益显现，行业中出现了一些规模化的落地项目，纷享车链就是其中逐渐崭露头角的项目之一。纷享车链旨在打造一个具有广泛信任共识的汽车出行行业开放式数据应用平台，不仅用于支撑用户私密数据积累和管理的高频交互，且提供给第三方进行可信契约开发和应用。而通过区块链技术赋能出行行业，打造基于信任的去中心化应用平台，通过可信设备所采集的可信数据是关键所在。?????为此，纷享车链前瞻性地制定了“可信物联网设备数据采集标准”，用于实现对第三方设备厂商的支持——任何符合标准的物联网车载智能硬件设备均可以接入平台，并在确权去耦后上传至数据保险柜中。作为收集出行数据的重要行业标准，纷享车链首款官方定制的可信设备——七巧智盒3in1已于8月初已经正式面市。????{IMG:1}???七巧智盒3in1具备车行赚钱+车损保障+车况检测三大功能。第一，七巧智盒3in1是纷享车链多币种矿机。使用产品后，每一次的开车出行数据都能获得相应的Token奖励。第二，七巧智盒3in1套装版本内含特别定制的“车辆贬值补偿计划”。车辆互助保障是对车辆因车损导致的贬值进行的额外补偿，除车损商业险外，其贬值补偿可高达10W。第三，七巧智盒3in1爱车监控功能能对近百种车况数据进行详尽展示和分析。?????纷享车链除了制定行业标准，率先研发车行可信设备，更提供了标准化的接入方案——通过技术扶持、资金扶持和用户扶持等方法赋能有志于加入生态的物联网品牌，打造纷享车链“可信物联网设备联盟”，逐步赋能各类车载智能设备，盘活各大硬件设备厂商。目前，纷享车链已经和多个硬件设备厂商展开生态合作，包括车智杰、思科尔特、山东思建、贵阳广电等，合作智能车载硬件设备包括车载OBD设备、行车记录仪、车载空气净化器、车载充电器、车载广告屏等。厂商表示，通过区块链赋能传统智能车载硬件设备，不仅能够给予原有的设备赋予“行车赚钱”这样的区块链功能，更能通过区块链技术有效解决原本行业中存在的“数据安全”等问题，赋予老产品新的价值和生命。?????对于车主而言，随着传统智能车载硬件设备具备区块链属性的“挖矿功能”，车主可以在驾车过程中通过使用产品完成数据的积累从而赢得财富——用户通过开车所得到的驾驶里程、驾驶时间等有效数据来获得纷享车链的token奖励，原本传统的车载硬件设备通过区块链赋能摇身一变将成为一台具有金融属性的“车载矿机”。相较传统矿机动辄千元万元的高昂售价，通过纷享车链赋能后的车载智能车载硬件矿机售价远远低于传统矿机的售价。完善的功能、挖矿的乐趣、源源不断的产出让更多车主享受到区块链带来的红利。??????随着空港出行领域私家车分享服务商AirCnC、汽车互助保障品牌车车助、老牌租车企业车速递等多家企业入驻了纷享车链完成应用落地，这足以证明纷享车链的市场认可度以及生态体系的日趋成熟。随之而来的是更多传统智能硬件设备厂商加入到纷享车链打造的“可信物联网设备联盟”，利用区块链技术逐步赋能各类车载、可穿戴等物联网设备，打破原有行业瓶颈，为商业发展注入新动力。?免责声明：本文仅代表企业观点，与经济网无关。其原创性以及文中陈述文字和内容未经本站证实，对本文以及其中全部或者部分内容、文字的真实性、完整性、及时性本站不作任何保证或承诺，请读者仅作参考，并请自行核实相关内容。如对内容有疑问请联系9651262@qq.com文章热词：上一篇：“明润视康”视力矫正，潜力无限的朝阳产业下一篇：投资心理误区知多少——心态决定成败分享按钮'"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 文本中似乎有评论形式的文本，抽查下具体形式\n",
    "# 提取出文本中包含{}的\n",
    "text_include_braces = [a for a in list(df_train['text']) if type(a) is not float and a.find('{')>0]\n",
    "text_include_braces[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
